## packages
library(tidyverse)
library(tidytext)
library(colorspace)
library(sf)
library(maps)
library(albersusa)
library(rgeocodio)
library(geojsonio)
library(rgeos)
library(ggwordcloud)
library(patchwork)
library(pdftools)
library(showtext)
## save plots?
## NOTE(review): `save` masks base::save() while this script runs; later
## chunks test `if(save == T)`, so the name must stay as is.
save <- TRUE
#save <- FALSE
## quality (resolution) of png's
dpi <- 750
## fonts: register Google fonts for use via showtext
font_add_google("Montserrat", "Montserrat")
font_add_google("Overpass", "Overpass")
font_add_google("Overpass Mono", "Overpass Mono")
## theme updates
## base theme: ggthemes::theme_clean with Montserrat as the default font;
## theme_update() then overrides individual elements for all later plots.
## NOTE(review): element_line(size = ...) is deprecated in ggplot2 >= 3.4
## in favor of linewidth -- kept here for compatibility with older versions.
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
theme_update(plot.margin = margin(30, 30, 30, 30),
plot.background = element_rect(color = "white",
fill = "white"),
plot.title = element_text(size = 16,
face = "bold",
lineheight = 1.15,
hjust = .5,
margin = margin(10, 0, 25, 0)),
#plot.title.position = "plot",
plot.caption = element_text(color = "grey40",
size = 9,
margin = margin(20, 0, -20, 0)),
plot.caption.position = "plot",
axis.line.x = element_line(color = "black",
size = .8),
axis.line.y = element_line(color = "black",
size = .8),
axis.title.x = element_text(size = 16,
face = "bold",
margin = margin(t = 20)),
axis.title.y = element_text(size = 16,
face = "bold",
margin = margin(r = 20)),
axis.text = element_text(size = 11,
color = "black",
face = "bold"),
axis.text.x = element_text(margin = margin(t = 10)),
axis.text.y = element_text(margin = margin(r = 10)),
axis.ticks = element_blank(),
## light grey solid grid on both axes, minor lines only vertically
panel.grid.major.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.major.y = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(4, "lines"),
panel.spacing.y = unit(2, "lines"),
legend.position = "top",
legend.title = element_text(family = "Montserrat",
color = "black",
size = 14,
margin = margin(5, 0, 5, 0)),
legend.text = element_text(family = "Montserrat",
color = "black",
size = 11,
margin = margin(4.5, 4.5, 4.5, 4.5)),
legend.background = element_rect(fill = NA,
color = NA),
legend.key = element_rect(color = NA, fill = NA),
#legend.key.width = unit(5, "lines"),
#legend.spacing.x = unit(.05, "pt"),
#legend.spacing.y = unit(.55, "pt"),
#legend.margin = margin(0, 0, 10, 0),
strip.text = element_text(face = "bold",
margin = margin(b = 10)))
## theme settings for flipped (horizontal bar) plots:
## swap minor grid lines so they follow the value axis after coord_flip()
theme_flip <-
theme(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_line(size = .6,
color = "#eaeaea"))
## theme settings for maps: blank canvas (theme_void) with horizontal
## legends and centered, bold titles/subtitles
theme_map <-
theme_void(base_family = "Montserrat") +
theme(legend.direction = "horizontal",
legend.box = "horizontal",
legend.margin = margin(10, 10, 10, 10),
legend.title = element_text(size = 17,
face = "bold"),
legend.text = element_text(color = "grey33",
size = 12),
plot.margin = margin(15, 5, 15, 5),
plot.title = element_text(face = "bold",
size = 20,
hjust = .5,
margin = margin(30, 0, 10, 0)),
## negative margins pull subtitle/caption into the map area
plot.subtitle = element_text(face = "bold",
color = "grey33",
size = 17,
hjust = .5,
margin = margin(10, 0, -30, 0)),
plot.caption = element_text(size = 14,
color = "grey33",
hjust = .97,
margin = margin(-30, 0, 0, 0)))
## numeric format for labels: thousands separator, no scientific notation
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = FALSE)
## main color backlinko
bl_col <- "#00d188"
## colors + labels for interval stripes (light -> dark green)
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")
## colors for degrees (Bachelors, Masters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))
## gradient color ramp used to encode position (rank)
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
## gradient colors for position ranks 1-10
pos_cols <- colfunc(10)

## Glassdoor job offers (US only); drop the scraped url column
df_gd <- readr::read_csv(here::here("raw_data", "Glassdoor - 2020-04-01-0.csv")) %>%
  janitor::clean_names() %>%
  dplyr::select(-url)

## LinkedIn job offers (worldwide); keep only titles mentioning SEO/seo
df_li <- readr::read_csv(here::here("raw_data", "Linkedin.csv")) %>%
  janitor::clean_names() %>%
  filter(str_detect(job_title, " seo | SEO")) %>%
  mutate(size = as.character(size))

## prepare Glassdoor data for joining
## prepare Glassdoor data for joining with the LinkedIn data:
## derive employment type and seniority from the free-text fields
df_gd_join <-
  df_gd %>%
  mutate(
    ## lower-case once instead of repeating str_to_lower() per pattern
    desc_lower = str_to_lower(description),
    title_lower = str_to_lower(job_title),
    employment_type = case_when(
      str_detect(desc_lower, "fulltime|full-time") ~ "Full-time",
      str_detect(desc_lower, "parttime|part-time") ~ "Part-time",
      TRUE ~ NA_character_
    ),
    ## first match wins, so the order of the patterns matters
    seniority = case_when(
      str_detect(desc_lower, "internship") ~ "Internship",
      str_detect(title_lower, "intern") ~ "Internship",
      str_detect(desc_lower, "junior") ~ "Junior",
      str_detect(title_lower, "junior") ~ "Junior",
      str_detect(desc_lower, "senior") ~ "Senior",
      str_detect(title_lower, "senior") ~ "Senior",
      str_detect(desc_lower, "entry level|entry-level") ~ "Entry level",
      str_detect(title_lower, "mid-senior|mid senior") ~ "Mid-Senior level",
      str_detect(title_lower, "director") ~ "Director",
      str_detect(title_lower, "executive") ~ "Executive",
      TRUE ~ NA_character_
    )
  ) %>%
  ## helper columns desc_lower/title_lower are dropped here (not selected)
  dplyr::select(
    job_title,
    employer,
    location,
    size,
    description,
    seniority,
    employment_type,
    industry,
    sector
  )
###################################################
## all job offers worldwide for global map:
## merge LinkedIn (worldwide) and Glassdoor (US) offers
df_world <-
df_li %>%
dplyr::select(
job_title,
employer,
location,
size,
description,
seniority,
employment_type,
industry,
"sector" = job_functions
) %>%
## joins on all shared columns (emits a `Joining, by = ...` message)
full_join(df_gd_join) %>%
mutate(
## collapse line breaks and repeated whitespace in the description
description = str_replace_all(description, "\\r", " "),
description = str_replace_all(description, "\\n", " "),
description = str_replace_all(description, "\\s+", " ")
) %>%
## drop offers listed on both platforms (identical on these four keys)
distinct(job_title, employer, location, description, .keep_all = TRUE)
###################################################
## only keep pages with language == EN
df_li_en <-
  df_li %>%
  ## filter on the country subdomain of the offer's url; the dot is escaped
  ## so e.g. "^https://www\\." cannot accidentally match "https://wwwx..."
  ## (deliberately excluded: be, gh, gr, il, in, mg, ng, ph, sg, ua, vn)
  filter(str_detect(url, "^https://(www|au|ca|ie|za)\\.")) %>%
  mutate(
    ## extract the subdomain as a crude country code ("www", "au", ...)
    country = str_sub(url, start = 1, end = 11),
    country = str_remove(country, "https://"),
    country = str_remove(country, "\\.")
  )
## join data: English-only LinkedIn offers + Glassdoor offers
df_en <-
df_li_en %>%
dplyr::select(
job_title,
employer,
location,
size,
description,
seniority,
employment_type,
industry,
"sector" = job_functions
) %>%
mutate(
## map LinkedIn job functions onto Glassdoor sector names;
## first matching pattern wins
sector = case_when(
str_detect(sector, "Business") ~ "Business Services",
str_detect(sector, "Information Technology") ~ "Information Technology",
str_detect(sector, "Consume") ~ "Consumer Services",
str_detect(sector, "Health") ~ "Health Care",
str_detect(sector, "Educat") ~ "Education",
str_detect(sector, "Retail") ~ "Retail",
str_detect(sector, "Insurance") ~ "Insurance",
str_detect(sector, "Media") ~ "Media",
str_detect(sector, "Manufact") ~ "Manufacturing",
str_detect(sector, "Account|Legal") ~ "Accounting & Legal",
str_detect(sector, "Travel|Tourism") ~ "Travel & Tourism",
str_detect(sector, "Biotech|Pharma") ~ "Biotech & Pharmaceuticals",
str_detect(sector, "Restaurant|Bar|Food") ~ "Restaurants, Bars & Food Services",
str_detect(sector, "Transport|Logistic") ~ "Transportation & Logistics",
str_detect(sector, "Construct|Repair|Maintenance") ~ "Construction, Repair & Maintenance",
str_detect(sector, "Finance") ~ "Finance",
## NOTE(review): "ArtsEntertain" looks like a typo for "Arts|Entertain"
## and will rarely match -- confirm against the raw job_functions values
str_detect(sector, "ArtsEntertain|Recreat") ~ "Arts, Entertainment & Recreation",
str_detect(sector, "Telecom") ~ "Telecommunications",
str_detect(sector, "Gov") ~ "Government",
TRUE ~ NA_character_
)
) %>%
full_join(df_gd_join) %>%
mutate(
description = str_replace_all(description, "\\r", " "),
description = str_replace_all(description, "\\n", " "),
description = str_replace_all(description, "\\s+", " ")
) %>%
## dedupe on the first 15 title characters to catch near-identical titles
distinct(str_sub(job_title, 1, 15), employer, location, .keep_all = TRUE)
#distinct(job_title, employer, location, description, .keep_all = TRUE) ## description often slightly different (diff: 111 offers)

We use two data sets:
The LinkedIn data contain global job offers while the GlassDoor data only jobs from the US. The LinkedIn data including only job offers with the term SEO (or seo) contain 2,387 observations from English-speaking countries (USA, Canada, UK, Ireland, Australia, South Africa) and 552 from the USA and the UK (links starting with www.linkedin.com).
We merged both data sets and kept as many variables as possible, manually creating new variables for both datasets (GlassDoor: seniority and employment type; LinkedIn: sector) based on text matching of job titles and descriptions. We also removed as many duplicated entries as possible by matching job title, employer and job location. The final worldwide data set contains 3,127 observations.
Because the job offers are collected from all over the world, a lot of foreign terms are included. Thus, we merged the GlassDoor data also with the English subset of the LinkedIn data and kept again as many variables as possible by manually creating new variables for both data sets. The final “All English” data set contains 1,344 observations.
The GlassDoor data are cleaner with regard to job titles and description than the LinkedIn data. Consequently, some plots using the GlassDoor data do a better job so we provide for now both versions (the merged “All English” data set and the GlassDoor data set).
Also, the GlassDoor data contain information that are missing from the LinkedIn data such as estimated salary range, rating, employer, industry, and size (no. of employees).
We analysed the data on job titles using text mining techniques. In a first step, we tokenize the job titles into single words and visualize their frequency. Stop words and words that appeared less than 7 times were removed to make the graph easier to grasp.
## Tokenize job titles into single words, drop stop words, and plot
## the most frequent terms as a sorted horizontal bar chart
df_en %>%
  unnest_tokens(word, job_title, token = "words") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(
    n >= 25,
    !word %in% c("seo")  ## "SEO" itself would dwarf everything else
  ) %>%
  mutate(
    ## acronyms upper-case, everything else title case
    word = if_else(
      word %in% c("sr", "ppc", "sem"),
      str_to_upper(word),
      str_to_title(word)
    )
  ) %>%
  ggplot(aes(fct_reorder(word, n), n)) +
  geom_col(fill = bl_col, width = .8) +
  geom_text(
    aes(label = n),
    family = "Overpass Mono",
    color = "white",
    fontface = "bold",
    size = 2.7,
    hjust = 1,
    nudge_y = -5
  ) +
  coord_flip() +
  scale_x_discrete(expand = c(.025, .025)) +
  scale_y_continuous(expand = c(.005, .005)) +
  labs(
    x = NULL,
    y = "Term frequency in job titles",
    caption = 'Note: Only words with a frequency of 25 or more shown. The term "SEO" was removed.'
  ) +
  theme_flip

if(save == TRUE){
  ggsave(here::here("plots", "1_1_jobs_word.pdf"), width = 12, height = 8, device = cairo_pdf)
}

## In a second step, we analysed sequences of words in the job title. The
## sorted bar plot shows the most popular consecutive sequences of words
## (5 or more occurrences), colored by category.
## Most common consecutive word sequences in job titles, grouped by category
df_job_ngrams_en <-
  df_en %>%
  ## NOTE(review): token = "ngrams" defaults to n = 3 words -- confirm intended
  unnest_tokens(word, job_title, token = "ngrams") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(
    n >= 10,
    !is.na(word),
    word != "search engine optimization"
  ) %>%
  mutate(
    ## assign each sequence to a job category; first match wins
    group = case_when(
      str_detect(word, "analyst") ~ "Analyst",
      str_detect(word, "content") ~ "Content Writer",
      str_detect(word, "manager") ~ "Management",
      str_detect(word, "market") ~ "Marketing",
      str_detect(word, "special") ~ "Specialist",
      str_detect(word, "strategist") ~ "Strategy",
      str_detect(word, "executive|head") ~ "Executive",
      TRUE ~ "Other"
    ),
    ## title case, then restore the acronyms
    word = str_to_title(word),
    word = str_replace(word, "Seo", "SEO"),
    word = str_replace(word, "Sem", "SEM"),
    word = str_replace(word, "Sr", "SR"),
    word = str_replace(word, "Ppc", "PPC"),
    word = factor(word)
  ) %>%
  ## "Other" always last in the legend
  mutate(group = fct_relevel(group, "Other", after = Inf))
## sorted bar chart of the most popular word sequences, colored by category
ggplot(
  df_job_ngrams_en,
  aes(fct_reorder(word, n), n)) +
  geom_col(
    aes(fill = group),
    width = .8
  ) +
  geom_text(
    aes(label = n),
    family = "Overpass Mono",
    color = "white",
    fontface = "bold",
    size = 3,
    hjust = 1,
    nudge_y = -.6
  ) +
  coord_flip() +
  scale_y_continuous(
    expand = c(.01, .01),
    limits = c(0, 33)
  ) +
  scale_fill_manual(
    name = "Job Category",
    values = c("#2d6db4", "#94346E", "#e49e2e", "#633b96", "#96633b", "#00b877", "#cc503e", "grey50")
  ) +
  guides(fill = guide_legend(ncol = 1)) +
  labs(
    x = NULL,
    y = "Frequency of word sequence in job titles",
    caption = 'Note: Only sequences with a frequency of 10 or more shown. The sequence "Search Engine Optimization" was removed.'
  ) +
  theme_flip +
  ## place the legend inside the panel area
  theme(
    legend.position = c(.76, .45),
    legend.key.size = unit(3, "pt"),
    legend.key.width = unit(30, "pt")
  )

if(save == TRUE){
  ggsave(here::here("plots", "1_1_jobs_cat.pdf"), width = 12, height = 7.5, device = cairo_pdf)
}

## We manually classified in technical and non-technical positions, removing
## all words that are not specific to any of the both categories. This
## modified stacked bar plot shows the number of words found per job category
## and, additionally as another stacked bar next to it, the most common words
## per category (with labels for words that occurred at least 20 times). The
## height of the stacks indicates as well the number, the width is arbitrary.
## technical: analyst, engineer, developer, technician, optimization,
## non-technical: manager, director, writer, consultant, coordinator, editor, marketing, sales, social media
df_stack <-
df_en %>%
unnest_tokens(word, job_title, token = "words") %>%
anti_join(stop_words) %>%
count(word, sort = T) %>%
mutate(
## classify terms into technical vs. non-technical; everything
## unmatched is dropped below
type = case_when(
str_detect(word, "analy|develop|technic|data") ~ "technical",
str_detect(word, "manage|direct|writ|consult|coordinat|edito|market|sale|social|strateg|supervis") ~ "non-technical",
TRUE ~ "unknown"
)
) %>%
filter(type != "unknown") %>%
group_by(type) %>%
mutate(sum = sum(n)) %>%
ungroup %>%
arrange(-sum, n) %>%
mutate(
## pos = vertical midpoint of each word's segment in the stacked bar
pos = cumsum(n),
pos = if_else(!is.na(lag(pos)), pos - ((pos - lag(pos)) / 2), pos / 2)
) %>%
mutate(type = fct_reorder(factor(type), sum)) %>%
## NOTE(review): column is named `country` but holds words -- looks like a
## leftover name from another script; confirm before renaming
mutate(country = fct_reorder2(factor(word), as.numeric(type), n, .desc = F)) %>%
group_by(type) %>%
arrange(n) %>%
## alpha scaled within each type; pos_cont = vertical center of the type
## block (result intentionally stays grouped by type)
mutate(
alpha = n / max(n),
pos_cont = min(pos) + (max(pos) - min(pos)) / 2
)
## modified stacked bar plot: one bar split by word, masked on the left,
## with word + count labels placed at the precomputed positions.
## NOTE(review): aes() below repeats fill/color with after_scale() to darken
## the mapped colors -- relies on ggplot2's after_scale() handling; confirm
## rendering when upgrading ggplot2.
df_stack %>%
ggplot(aes(1, n)) +
## outer stacked bar: one segment per word, alpha-scaled by frequency
geom_col(
aes(
fill = type,
fill = after_scale(darken(fill, .05)),
color = type,
color = after_scale(darken(color, .15)),
alpha = alpha
),
size = .1
) +
## narrow inner bar in a darker shade
geom_col(
aes(
fill = type,
color = type,
fill = after_scale(darken(fill, .3)),
color = after_scale(darken(color, .3))
),
width = .4,
size = .1
) +
## white rectangle masks the left half of the bars
geom_rect(
xmin = -Inf,
xmax = .8,
ymin = -Inf,
ymax = Inf,
fill = "white"
) +
## per-type headline ("technical title terms" etc.)
geom_text(
data = df_stack %>% group_by(type) %>% summarize(pos_cont = unique(pos_cont)),
aes(
x = 1,
y = pos_cont + 35,
label = glue::glue("{type}\ntitle terms")
),
family = "Montserrat",
fontface = "bold",
color = "white",
size = 8,
lineheight = .9,
hjust = .5,
vjust = 0
) +
## per-type total count, formatted with thousands separator
geom_text(
data = df_stack %>%
group_by(type) %>%
summarize(
pos_cont = unique(pos_cont),
sum = format(unique(sum), big.mark = ",")
),
aes(
x = 1,
y = pos_cont,
label = sum
),
family = "Overpass",
color = "white",
size = 10,
lineheight = .9,
hjust = .5,
vjust = .5
) +
## word labels next to the bar (only words with n >= 10)
geom_text(
data = df_stack %>% filter(n >= 10),
aes(
x = 1.47,
y = pos,
label = word,
color = type,
color = after_scale(darken(color, .15)),
size = n
),
family = "Montserrat",
fontface = "bold",
hjust = 0
) +
## count labels inside the bar segments
geom_text(
data = df_stack %>% filter(n >= 10),
aes(
x = 1.33,
y = pos,
label = n,
size = n / 2
),
family = "Overpass",
color = "white",
hjust = .5
) +
scale_x_continuous(limits = c(0.5, 2.15)) +
scale_color_manual(
values = c(bl_col, "#8800d1"),
guide = F
) +
scale_fill_manual(
values = c(bl_col, "#8800d1"),
guide = F
) +
scale_alpha(
range = c(.3, 1),
guide = F
) +
scale_size(
range = c(1.5, 15),
guide = F
) +
theme_void()

## world map
sf_world <-
st_as_sf(rworldmap::getMap(resolution = "high")) %>%
## equal-area cylindrical projection for the world map
st_transform(crs = "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs") %>%
dplyr::select(ISO_A3, continent)
## US states (contiguous US only)
sf_states <-
usa_sf() %>%
filter(!name %in% c("Alaska", "Hawaii")) %>%
## EPSG 2163: US National Atlas equal-area projection
st_transform(2163)
## US counties (contiguous US only)
sf_counties <-
counties_sf() %>%
filter(!state %in% c("Alaska", "Hawaii")) %>%
st_transform(2163)

## add geocodes of cities worldwide
path <- here::here("proc_data", "geocodes_world.Rds")

## number of offers per location string, worldwide
df_loc_world <-
  df_world %>%
  count(location, sort = TRUE)

## geocoding disabled: gio_batch_geocode() returns only addresses
## from North America, so the world map cannot be built from it
# if(file.exists(path)){
#   ## load geocodes or...
#   df_geo_world <- readRDS(path)
# }else{
#   ## grab and save geocodes
#   df_geo_world <-
#     gio_batch_geocode(df_loc_world$location) %>%
#     unnest(response_results, .preserve = response_warnings) %>%
#     dplyr::select(query, formatted_address, location.lat, location.lng) %>%
#     group_by(query) %>%
#     slice(1)
#
#   saveRDS(df_geo_world, path)
# }
## billing needed!
#df_geo <- mutate_geocode(df_loc_world, location)
# sf_map_world <-
#   df_loc_world
#   left_join(df_geo_world, by = c("location" = "query")) %>%
#   filter(!is.na(location.lng)) %>%
#   mutate(location.lng = location.lng * 10^5, location.lat = location.lat * 10^5) %>%
#   st_as_sf(coords = c("location.lng", "location.lat"),
#            crs = st_crs(world)) %>%
#   st_transform(st_crs(world))

## locations English-speaking countries
## number of offers per location in English-speaking countries
df_loc_en <-
  df_en %>%
  count(location, sort = TRUE)

## add geocodes of cities in English-speaking countries (cached on disk)
path <- here::here("proc_data", "geocodes_en.Rds")
if(file.exists(path)){
  ## load cached geocodes or...
  df_geo_en <- readRDS(path)
}else{
  ## ...grab and save geocodes, keeping one result per query
  df_geo_en <-
    gio_batch_geocode(df_loc_en$location) %>%
    unnest(response_results, .preserve = response_warnings) %>%
    dplyr::select(query, formatted_address, location.lat, location.lng) %>%
    group_by(query) %>%
    slice(1)

  saveRDS(df_geo_en, path)
}
## map data North America: geocoded US locations projected to the state CRS
sf_map_us <-
  df_loc_en %>%
  left_join(df_geo_en, by = c("location" = "query")) %>%
  filter(
    !is.na(location.lng),
    ## keep US locations only
    !str_detect(location, "United Kingdom|South Africa|Australia|Ireland|Canada")
  ) %>%
  st_as_sf(coords = c("location.lng", "location.lat"),
           crs = 4326) %>%
  st_transform(st_crs(sf_states)) %>%
  st_crop(st_bbox(sf_states))
## long version with 1 row per offer to count spatially
sf_map_us_long <-
  df_loc_en %>%
  group_by(location) %>%
  ## expand each location to one row per single job offer
  expand(n = seq_len(n)) %>%
  left_join(df_geo_en, by = c("location" = "query")) %>%
  filter(
    !is.na(location.lng),
    ## keep US locations only
    !str_detect(location, "United Kingdom|South Africa|Australia|Ireland|Canada")
  ) %>%
  st_as_sf(coords = c("location.lng", "location.lat"),
           crs = 4326) %>%
  st_transform(st_crs(sf_states)) %>%
  st_crop(st_bbox(sf_states))

# sf_map_world %>%
# arrange(-n) %>%
# ggplot() +
# geom_sf(data = world,
# #color = darken(bl_col, .05),
# color = "grey60",
# alpha = .3,
# lwd = 5) +
# geom_sf(data = world,
# color = "white",
# fill = "#cedbd7",
# lwd = .5) +
# geom_sf(aes(size = n),
# color = "white",
# show.legend = "point") +
# geom_sf(aes(size = n),
# shape = 21,
# color = darken(bl_col, .2),
# fill = NA,
# stroke = .4,
# show.legend = "point") +
# geom_sf(aes(size = n, color = n),
# color = bl_col,
# alpha = .1,
# show.legend = "point") +
# scale_size(range = c(3, 30),
# breaks = c(1, 10, 25, 50, 100),
# name = "Number of job offers") +
# guides(size = guide_legend(title.position = "top",
# title.hjust = .5,
# nrow = 1,
# label.position = "bottom",
# override.aes = list(shape = 21, color = bl_col, fill = lighten(bl_col, .9), stroke = 1))) +
# theme_map +
# theme(legend.position = c(.2, .1),
# legend.title = element_text(margin = margin(0, 0, -5, 0)),
# legend.text = element_text(margin = margin(-10, 0, 0, 0)))
#
# if(save == T){
# ggsave(here::here("plots", "2_1_map_world_cities.pdf"),
# width = 15, height = 9.7, device = cairo_pdf)
# }

## dark-themed map: job offers per city as bubbles on the world/US polygons
sf_map_us %>%
  ## draw large bubbles first so small ones stay visible on top
  arrange(-n) %>%
  ggplot() +
  # geom_sf(
  #   data = world,
  #   #color = darken(bl_col, .05),
  #   color = "grey60",
  #   alpha = .2,
  #   lwd = 3
  # ) +
  geom_sf(
    data = sf_world,
    color = "grey60",
    fill = lighten("#374b45", .15),
    lwd = .4
  ) +
  ## USA slightly darker than the rest of the world
  geom_sf(
    data = sf_world %>% filter(ISO_A3 == "USA"),
    color = "grey60",
    fill = "#374b45",
    lwd = .4
  ) +
  ## white base bubble + outline + translucent green fill
  geom_sf(
    aes(size = n),
    color = "white",
    show.legend = "point"
  ) +
  geom_sf(
    aes(size = n),
    shape = 21,
    color = darken(bl_col, .2),
    fill = NA,
    stroke = .1,
    show.legend = "point"
  ) +
  geom_sf(
    aes(
      size = n,
      color = n
    ),
    color = bl_col,
    alpha = .1,
    show.legend = "point"
  ) +
  ## crop to the contiguous US (projected coordinates)
  coord_sf(
    xlim = c(-2300000, 2900000),
    ylim = c(-2200000, 1000000)
  ) +
  scale_size(
    range = c(1, 25),
    breaks = c(1, 10, 50, 100),
    name = "Number of\njob offers"
  ) +
  guides(
    size = guide_legend(
      title.position = "top",
      title.hjust = .5,
      nrow = 1,
      label.position = "bottom",
      override.aes = list(
        shape = 21,
        color = bl_col,
        fill = lighten(bl_col, .9),
        stroke = 1
      )
    )
  ) +
  theme_map +
  theme(
    legend.position = c(.1, .12),
    legend.title = element_text(color = "grey90", margin = margin(0, 0, -5, 0)),
    legend.text = element_text(color = "grey90", margin = margin(-5, 0, 0, 0)),
    panel.background = element_rect(fill = "grey30"),
    panel.border = element_rect(color = "grey60", fill = "transparent", size = 4.5),
    panel.grid.major = element_line(color = "grey45", size = .4, linetype = "dashed"),
    plot.margin = margin(0, 0, 0, 0)
  )

## light-themed map: bubbles on US state polygons
sf_map_us %>%
arrange(-n) %>%  ## large bubbles first so small ones stay visible
  ggplot() +
  ## soft state outline halo
  geom_sf(
    data = sf_states,
    #color = darken(bl_col, .05),
    color = "grey60",
    alpha = .3,
    lwd = 5
  ) +
  geom_sf(
    data = sf_states,
    color = "#cedbd7",#"white",
    fill = darken("#cedbd7", .15, space = "HLS"),
    lwd = .5
  ) +
  ## white base bubble + outline + translucent green fill
  geom_sf(
    aes(size = n),
    color = "white",
    show.legend = "point"
  ) +
  geom_sf(
    aes(size = n),
    shape = 21,
    color = darken(bl_col, .2),
    fill = NA,
    stroke = .4,
    show.legend = "point"
  ) +
  geom_sf(
    aes(size = n, color = n),
    color = bl_col,
    alpha = .1,
    show.legend = "point"
  ) +
  scale_size(
    range = c(3, 30),
    breaks = c(1, 10, 25, 50, 100),
    name = "Number of job offers"
  ) +
  guides(
    size = guide_legend(
      title.position = "top",
      title.hjust = .5,
      nrow = 1,
      label.position = "bottom",
      override.aes = list(
        shape = 21,
        color = bl_col,
        fill = lighten(bl_col, .9),
        stroke = 1
      )
    )
  ) +
  theme_map +
  theme(
    legend.position = c(.2, .1),
    legend.title = element_text(margin = margin(0, 0, -5, 0)),
    legend.text = element_text(margin = margin(-10, 0, 0, 0))
  )

## US cities with their offer counts; CA/NY highlighted, rest lumped
df_cities <-
df_loc_en %>%
  left_join(df_geo_en, by = c("location" = "query")) %>%
  filter(
    !is.na(location.lng),
    ## keep US locations only
    !str_detect(location, "United Kingdom"),
    !str_detect(location, "South Africa"),
    !str_detect(location, "Australia"),
    !str_detect(location, "Ireland"),
    !str_detect(location, "Canada")
  ) %>%
  filter(n >= 10) %>%
  mutate(
    ## state = last comma-separated token of the location string
    state = str_extract(location, "[^, ]*$"),
    state_lump = case_when(
      state == "CA" ~ "California",
      state == "NY" ~ "New York",
      TRUE ~ "other"
    )
  )

## sorted bar chart of cities, colored by lumped state
df_cities %>%
  ggplot(aes(n, fct_reorder(location, n))) +
  geom_col(
    aes(fill = state_lump),
    orientation = "y",
    width = .85
  ) +
  ## state label inside the bar for the longest bars only
  geom_text(
    data = df_cities %>% filter(n > 35),
    aes(label = state_lump),
    family = "Montserrat",
    fontface = "bold",
    color = "white",
    hjust = 1,
    nudge_x = -3
  ) +
  scale_x_continuous(
    breaks = seq(0, 150, by = 25),
    expand = c(.001, .001)
  ) +
  scale_fill_manual(
    values = c(darken(bl_col, .25), bl_col, "grey60"),
    guide = FALSE
  ) +
  labs(
    x = "Number of job offers",
    y = NULL,
    caption = 'Note: Only cities with 10 or more job offers shown.'
  )

## centroids for labels
## offers per state via spatial intersection of city points with polygons
sf_states_count <-
  sf_states %>%
  mutate(
    pt_count = lengths(st_intersects(sf_states, sf_map_us_long)),
    ## zero counts become NA so they get the na.value fill
    pt_count = if_else(pt_count == 0, NA_integer_, pt_count)
  )
centroids <-
  sf_states_count %>%
  st_centroid()

sf_states_count %>%
  ggplot() +
  geom_sf(
    aes(fill = pt_count),
    color = "grey80",
    size = .6
  ) +
  #geom_sf(data = sf_map_en_long) +
  rcartocolor::scale_fill_carto_c(
    palette = "Emrld",
    na.value = "grey96",
    breaks = c(1, seq(25, 200, by = 25)),
    name = "Number of job offers"
  ) +
  ## dark labels on light fills (< 100) ...
  geom_sf_text(
    data = sf_states_count %>% filter(name != "Maryland", pt_count < 100),
    aes(label = pt_count),
    family = "Overpass",
    fontface = "bold"
  ) +
  ## ... white labels on dark fills (>= 100)
  geom_sf_text(
    data = sf_states_count %>% filter(name != "Maryland", pt_count >= 100),
    aes(label = pt_count),
    family = "Overpass",
    fontface = "bold",
    color = "white"
  ) +
  ## Maryland label nudged north to avoid overplotting
  geom_sf_text(
    data = sf_states_count %>% filter(name == "Maryland"),
    aes(label = pt_count),
    family = "Overpass",
    fontface = "bold",
    nudge_y = 35000
  ) +
  guides(fill = guide_colorbar(
    title.position = "top",
    title.hjust = .5,
    label.position = "bottom",
    barwidth = unit(30, "lines"),
    barheight = unit(.6, "lines"))
  ) +
  theme_map +
  theme(legend.position = c(.5, .95))

if(save == TRUE){
  ggsave(here::here("plots", "2_2_map_states_chloro.pdf"),
         width = 15, height = 9.7, device = cairo_pdf)
}

## (This map is derived from spatial locations by intersecting cities with
## state polygons - thus slightly different numbers compared to the hexagonal
## grid map which uses states as stated by the source.)
## offers per state as stated in the source location strings
df_states_join <-
df_loc_en %>%
mutate(location = str_replace(location, ", United States|, US", "")) %>%
## last comma-separated token = state code or name
mutate(state_mixed = str_extract(location, "[^, ]*$"))
## NOTE(review): the join below matches only two-letter codes against
## iso_3166_2; locations ending in full state names get n = NA here --
## confirm this is intended (cf. the df_states lookup further down)
sf_states_count <-
sf_states %>%
left_join(df_states_join, by = c("iso_3166_2" = "state_mixed")) %>%
filter(!is.na(geo_id)) %>%
group_by(name) %>%
summarize(n = sum(n, na.rm = T)) %>%
mutate(n = if_else(n == 0, NA_integer_, n)) %>%
ungroup()
## centroids for labels
centroids <-
sf_states_count %>%
st_centroid()
sf_states_count %>%
ggplot() +
geom_sf(
aes(fill = n),
color = "grey80",
size = .6
) +
#geom_sf(data = sf_map_en_long) +
rcartocolor::scale_fill_carto_c(
palette = "Emrld",
na.value = "grey96",
breaks = c(1, seq(25, 200, by = 25)),
name = "Number of job offers"
) +
## dark labels on light fills (< 100) ...
geom_sf_text(
data = sf_states_count %>% filter(name != "Maryland", n < 100),
aes(label = n),
family = "Overpass",
fontface = "bold"
) +
## ... white labels on dark fills (>= 100)
geom_sf_text(
data = sf_states_count %>% filter(name != "Maryland", n >= 100),
aes(label = n),
family = "Overpass",
fontface = "bold",
color = "white"
) +
## Maryland label nudged north to avoid overplotting
geom_sf_text(
data = sf_states_count %>% filter(name == "Maryland"),
aes(label = n),
family = "Overpass",
fontface = "bold",
nudge_y = 35000
) +
guides(fill = guide_colorbar(
title.position = "top",
title.hjust = .5,
label.position = "bottom",
barwidth = unit(30, "lines"),
barheight = unit(.6, "lines"))
) +
theme_map +
theme(legend.position = c(.5, .95))

## data by states
## lookup table: full state name <-> ISO2 code
df_states <-
  readr::read_csv(here::here("raw_data", "50_us_states_world_data.csv"),
                  col_names = FALSE) %>%
  dplyr::select(state = "X2", ISO2 = "X3") %>%
  ## fix: "Columbia", not "Colombia" -- the misspelled name never matched
  ## the hex grid's google_name, so DC showed no count
  add_row(state = "District of Columbia", ISO2 = "DC")

## offers per state; location strings end in either an ISO2 code or a name
df_states_join <-
  df_loc_en %>%
  mutate(location = str_replace(location, ", United States|, US", "")) %>%
  mutate(state_mixed = str_extract(location, "[^, ]*$")) %>%
  left_join(df_states, by = c("state_mixed" = "ISO2")) %>%
  ## fall back to the raw token when it was already a full name
  mutate(state = if_else(is.na(state), state_mixed, state)) %>%
  group_by(state) %>%
  summarize(n = sum(n, na.rm = TRUE)) %>%
  ungroup()

## hex map
map_hex <- geojson_read(here::here("raw_data", "us_states_hexgrid.geojson.json"), what = "sp")
map_hex@data <-
  map_hex@data %>%
  mutate(google_name = gsub(" \\(United States\\)", "", google_name))
## NOTE(review): tidy() for sp objects lives in broom, which is not attached
## above -- confirm library(broom)/broom::tidy is available at runtime
map_hex_fortified <- tidy(map_hex, region = "google_name")
## combine hex polygons with the per-state counts
df_hex <-
  map_hex_fortified %>%
  left_join(df_states_join, by = c("id" = "state"))
## centroids for labels
centroids <- cbind.data.frame(data.frame(gCentroid(map_hex, byid = TRUE),
                                         id = map_hex@data$iso3166_2,
                                         id_long = map_hex@data$google_name,
                                         id_wrap = str_wrap(map_hex@data$google_name, 12))) %>%
  left_join(df_hex, by = c("id_long" = "id")) %>%
  group_by(id) %>%
  slice(1) %>%
  replace_na(list(n = 0))

df_hex %>%
replace_na(list(n = 0)) %>%  ## states without offers get an explicit 0
  ggplot() +
  geom_polygon(
    aes(
      long, lat,
      group = group,
      fill = n
    ),
    color = darken(bl_col, .1),
    #color = "grey60",
    lwd = .8
  ) +
  ## count label in the upper half of each hexagon, faded for low counts
  geom_text(
    data = centroids,
    aes(
      x = x,
      y = y + 0.35,
      label = n,
      alpha = n
    ),
    family = "Montserrat",
    fontface = "bold",
    size = 7.5,
    color = darken(bl_col, .6),
    vjust = .2
  ) +
  ## wrapped state name in the lower half of each hexagon
  geom_text(
    data = centroids,
    aes(
      x = x,
      y = y - 0.3,
      label = id_wrap
    ),
    family = "Montserrat",
    fontface = "bold",
    size = 3.2,
    lineheight = 0.85,
    vjust = 1
  ) +
  coord_map() +
  scale_fill_gradient(
    low = "grey95",
    high = darken(bl_col, .1),
    name = "Number of job offers",
    limits = c(0, 200),
    breaks = seq(0, 200, by = 25)
  ) +
  scale_alpha(
    range = c(.25, 1),
    guide = FALSE
  ) +
  guides(fill = guide_colorbar(
    barheight = unit(5, units = "mm"),
    barwidth = unit(150, units = "mm"),
    direction = "horizontal",
    ticks.colour = "#e8d8c3",
    title.position = "top",
    title.hjust = 0.5)
  ) +
  theme_map +
  theme(
    legend.position = c(.5, .9),
    legend.text = element_text(size = 14)
  )

## county-level choropleth of job offers
sf_counties %>%
mutate(
  ## offers per county via spatial intersection with city points
  pt_count = lengths(st_intersects(sf_counties, sf_map_us_long)),
  ## note indicates max value of New York since too small to see
  pt_count = if_else(name == "New York", 60L, pt_count),
  pt_count = if_else(pt_count == 0, NA_integer_, pt_count)
) %>%
ggplot() +
geom_sf(
  aes(fill = pt_count),
  color = "grey80",
  size = .2
) +
## state borders on top of the county fills
geom_sf(
  data = sf_states,
  fill = NA,
  color = "grey50",
  size = .3
) +
#geom_sf(data = sf_map_en_long) +
## annotation arrow pointing at New York City
geom_segment(
  x = 1600000,
  xend = 2142000,
  y = 250000,
  yend = -125000,
  arrow = arrow(length = unit(0.015, "npc")),
  color = "grey40",
  size = .5
) +
geom_text(
  x = 1600000,
  y = 276000,
  label = "New York:\n155 job offers",
  family = "Montserrat",
  color = "grey40",
  size = 3.5,
  lineheight = .9,
  vjust = 0
) +
rcartocolor::scale_fill_carto_c(
  palette = "Emrld",
  na.value = "grey96",
  name = "Number of job offers",
  breaks = c(1, seq(10, 60, by = 10)),
  limits = c(NA, 60)
) +
guides(
  fill = guide_colorbar(
    title.position = "top",
    title.hjust = .5,
    label.position = "bottom",
    barwidth = unit(30, "lines"),
    barheight = unit(.6, "lines")
  )
) +
theme_map +
theme(legend.position = c(.5, .95))

if(save == TRUE){
  ggsave(here::here("plots", "2_3_map_counties_chloro.pdf"),
         width = 15, height = 9.7, device = cairo_pdf)
}

## (This map is derived from spatial locations by intersecting cities with
## county polygons - thus there might be small differences compared to the
## state-level maps.)
## company size: normalize Glassdoor size strings ("51 to 200 employees")
## into an ordered factor of size classes, with thousands separators in labels
df_size <-
df_gd %>%
dplyr::select(employer, size, description) %>%
mutate(
size = if_else(is.na(size), "Unknown", size),
size = str_replace(size, " employees", ""),
size = str_replace(size, " to ", "–"),
size = factor(size,
levels = c("1–50", "51–200", "201–500", "501–1000",
"1001–5000", "5001–10000", "10000+", "Unknown"),
labels = c("1–50", "51–200", "201–500", "501–1,000",
"1,001–5,000", "5,001–10,000", "10,000+", "Unknown"),
)
)-> Counts of unique companies per size class
## bar chart: unique companies per size class, "Unknown" greyed out;
## outlines derived from the fill via after_scale() (inside aes(), as required)
df_size %>%
group_by(size) %>%
summarize(n = n_distinct(employer)) %>%
mutate(col = if_else(size == "Unknown", "A", "B")) %>%
ggplot(aes(size, n)) +
geom_col(
aes(
fill = col,
color = after_scale(darken(fill, .4))
),
width = .85,
size = 1
) +
scale_y_continuous(
expand = c(0, 0),
limits = c(0, 195)
) +
scale_fill_manual(
values = c("grey60", bl_col),
guide = F
) +
labs(
x = "Number of employees",
y = "Number of companies"
) +
theme(panel.grid.major.x = element_blank())if(save == T){
ggsave(here::here("plots", "3_1_size_histo.pdf"),
width = 12, height = 8, device = cairo_pdf)
}CED: What do we define as “more specialized tasks”?
## company revenue: normalize Glassdoor revenue strings
## ("$1 to $5 million (USD)") into a compact ordered factor ("$1–5M", …)
df_revenue <-
df_gd %>%
dplyr::select(revenue, employer, description) %>%
mutate(
revenue = if_else(revenue == "Unknown / Non-Applicable", "Unknown", revenue),
revenue = if_else(is.na(revenue), "Unknown", revenue),
revenue = str_replace(revenue, " \\(USD\\)", ""),
revenue = str_replace(revenue, " million", "M"),
revenue = str_replace(revenue, " billion", "B"),
revenue = str_replace(revenue, " to \\$", "–"),
revenue = str_replace(revenue, "Less than", "<"),
revenue = if_else(revenue == "$10+B", "> $10B", revenue),
revenue = factor(revenue,
levels = c("< $1M", "$1–5M", "$5–10M", "$10–25M",
"$25–50M", "$50–100M", "$100–500M", "$500M–1B",
"$1–2B", "$2–5B", "$5–10B", "> $10B",
"Unknown"))
)-> Counts of unique companies per revenue class
## bar chart: unique companies per revenue class ("Unknown" dropped)
df_revenue %>%
  group_by(revenue) %>%
  summarize(n = n_distinct(employer)) %>%
  filter(revenue != "Unknown") %>%
  ggplot(aes(revenue, n)) +
  geom_col(
    fill = bl_col,
    ## bug fix: `after_scale()` is only valid inside `aes()`; with a constant
    ## fill the darkened outline can simply be computed directly
    color = darken(bl_col, .4, space = "HLS"),
    width = .8,
    size = 1
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, NA)
  ) +
  labs(
    ## bug fix: labels were copy-pasted from the company-size chart —
    ## x shows the revenue classes, y the number of companies
    x = "Estimated revenue",
    y = "Number of companies"
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(size = 10)
  )
if(save == T){
  ggsave(here::here("plots", "3_2_revenue_histo.pdf"),
         width = 12, height = 8, device = cairo_pdf)
}
I tokenized the description and removed stop words and numbers as well as manually non-sense/non-skill-related words. There might be more but if we keep it we can have a closer look I would say.
## tokenize job descriptions into words; drop stop words, numbers, and a
## manual list of non-skill boilerplate terms, then compare word clouds for
## low- vs high-revenue companies
df_rev_skills <-
df_revenue %>%
unnest_tokens(word, description, token = "words") %>%
anti_join(stop_words) %>%
filter(
!word %in% c("seo","experience", "skills", "skill", "ability", "requirement", "requirements", "link", "required", "provide", "day", "email", "based", "growth", "recommendations", "employment", "paid", "payment", "key", "gender", "equal", "applicants", "application", "disability", "disabilities", "world", "qualifications", "multiple", "page", "pages", "related", "site", "candidate", "insurance", "company", "agency", "office", "position"),
!str_detect(word, "[0-9+]")
) %>%
## merge singular/plural variant
mutate(word = if_else(word == "teams", "team", word))
## fixed seed so the word cloud layout is reproducible
set.seed(2020)
cloud_low <-
df_rev_skills %>%
filter(revenue %in% c("< $1M", "$1–5M", "$5–10M", "$10–25M",
"$25–50M", "$50–100M")) %>%
count(word, sort = T) %>%
top_n(75, n) %>%
ggplot(aes(label = word, size = n, color = n)) +
geom_text_wordcloud(
family = "Montserrat",
fontface = "bold",
shape = "square",
grid_margin = 2.5
) +
rcartocolor::scale_color_carto_c(palette = "Emrld") +
scale_size_area(max_size = 7) +
labs(title = "\nCompanies with revenues lower than $100M") +
theme_minimal()
cloud_high <-
df_rev_skills %>%
filter(revenue %in% c("$100–500M", "$500M–1B",
"$1–2B", "$2–5B", "$5–10B", "> $10B")) %>%
count(word, sort = T) %>%
top_n(75, n) %>%
ggplot(aes(label = word, size = n, color = n)) +
geom_text_wordcloud(
family = "Montserrat",
fontface = "bold",
shape = "square",
grid_margin = 2.5
) +
rcartocolor::scale_color_carto_c(palette = "Emrld") +
scale_size_area(max_size = 7) +
labs(title = "\nCompanies with revenues higher than $100M") +
theme_minimal()
## vertical alignment (patchwork: `/` stacks, `*` applies theme to all panels)
(cloud_low / cloud_high) *
theme(plot.title = element_text(family = "Montserrat", size = 14, face = "bold",
hjust = .5, margin = margin(b = -30)))if(save == T){
ggsave(here::here("plots", "3_2_revenue_words_vertical.pdf"),
width = 11, height = 8, device = cairo_pdf)
}
## horizontal alignment
(cloud_low + cloud_high) *
theme(plot.title = element_text(family = "Montserrat", size = 14, face = "bold",
hjust = .5, margin = margin(b = -15)))df_gd %>%
## horizontal bar charts: job offers per sector, then per industry
## (industries filtered to n > 4, as stated in the caption)
filter(!is.na(sector)) %>%
group_by(sector) %>%
count(sort = T) %>%
ungroup() %>%
mutate(sector = fct_reorder(factor(sector), n)) %>%
ggplot(aes(sector, n)) +
geom_col(
width = .8,
fill = bl_col
) +
geom_text(
aes(label = n),
family = "Overpass Mono",
color = "grey40",
fontface = "bold",
hjust = 0,
nudge_y = 3,
size = 2.7
) +
coord_flip() +
scale_y_continuous(
expand = c(.01, .01),
limits = c(0, 345),
breaks = seq(0, 300, by = 50)
) +
labs(
x = NULL,
y = "Number of job offers by sector"
) +
theme_flipif(save == T){
ggsave(here::here("plots", "3_3_sector_counts.pdf"),
width = 12, height = 6.5, device = cairo_pdf)
}df_gd %>%
filter(!is.na(industry)) %>%
group_by(industry) %>%
count(sort = T) %>%
filter(n > 4) %>%
ungroup() %>%
mutate(industry = fct_reorder(factor(industry), n)) %>%
ggplot(aes(industry, n)) +
geom_col(
width = .8,
fill = bl_col
) +
geom_text(
aes(label = n),
family = "Overpass Mono",
color = "grey40",
fontface = "bold",
hjust = 0,
nudge_y = 3,
size = 2.7
) +
coord_flip() +
scale_y_continuous(
expand = c(.01, .01),
limits = c(0, 265),
breaks = seq(0, 250, by = 50)
) +
labs(
x = NULL,
y = "Number of job offers by industry",
caption = "Note: Only industries with 5 or more job offers shown."
) +
theme_flip-> Counts of unique companies per rating
## lollipop chart: number of unique companies per Glassdoor rating value
df_gd %>%
  group_by(ratings) %>%
  summarize(n = n_distinct(employer)) %>%
  ggplot(aes(ratings, n)) +
  geom_segment(
    aes(
      xend = ratings,
      yend = 0
    ),
    color = darken(bl_col, .2),
    size = 1.5
  ) +
  geom_point(
    fill = bl_col,
    color = darken(bl_col, .2),
    shape = 21,
    size = 5,
    stroke = .5
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 40)
  ) +
  labs(
    ## bug fix: x axis shows company ratings, not employee counts —
    ## the label was copy-pasted from the company-size chart
    x = "Company rating",
    y = "Number of companies"
  ) +
  theme(panel.grid.major.x = element_blank())
if(save == T){
  ggsave(here::here("plots", "3_4_rating_lolli.pdf"),
         width = 12, height = 8, device = cairo_pdf)
}
df_gd %>%
## histogram of companies per rating range; `floor(ratings - .01)` makes the
## bins right-closed, i.e. a rating of exactly 4.0 falls into "3–4"
ungroup() %>%
mutate(rating_floor = floor(ratings - .01)) %>%
group_by(rating_floor) %>%
summarize(n = n_distinct(employer)) %>%
ggplot(aes(rating_floor, n)) +
geom_col(
fill = bl_col,
color = darken(bl_col, .2)
) +
scale_x_continuous(
breaks = 0:4,
labels = c("< 1", "1–2", "2–3", "3–4", "> 4")
) +
scale_y_continuous(
expand = c(0, 0),
limits = c(0, NA)
) +
labs(
x = "Rating range",
y = "Number of companies"
) +
theme(panel.grid.major.x = element_blank())df_tasks_words <-
## overall word cloud of description tokens (stop words, short words and
## numbers removed; "seo" excluded as it appears in nearly every offer)
df_gd %>%
unnest_tokens(word, description, token = "words") %>%
anti_join(stop_words) %>%
count(word, sort = T) %>%
filter(
nchar(word) > 2,
!str_detect(word, "[0-9]+")
)
df_tasks_words %>%
filter(
word != "seo",
n >= 10
) %>%
ggplot(
aes(
label = word,
size = n,
color = n
)
) +
geom_text_wordcloud(
family = "Montserrat",
fontface = "bold",
shape = "square",
grid_margin = 3
) +
rcartocolor::scale_color_carto_c(palette = "ag_Sunset") +
scale_size_area(max_size = 20) +
theme_minimal()if(save == T){
ggsave(here::here("plots", "4_cloud_word.pdf"),
width = 25, height = 15, device = cairo_pdf)
}
# df_tasks_sequ <-
# df_gd %>%
# unnest_tokens(word, description, token = "ngrams") %>%
# count(word, sort = T) %>%
# filter(
# !word %in% c(
# "search engine optimization",
# "as well as",
# "is a plus",
# "up to date",
# "the ability to",
# "ability to work",
# "an equal opportunity",
# "be responsible for",
# "are looking for",
# "working knowledge of",
# "you will be",
# "years of experience",
# "we are looking",
# "looking for a",
# "in order to",
# "be able to",
# "in a fast",
# "engine optimization seo",
# "work closely with",
# "is an equal",
# "will be responsible",
# "to join our"
# ),
# n >= 10
# )
#
# df_tasks_sequ %>%
# filter(
# word != "seo",
# n >= 10
# ) %>%
# ggplot(aes(label = word, size = n, color = n)) +
# geom_text_wordcloud(
# family = "Montserrat",
# fontface = "bold",
# shape = "square"
# ) +
# rcartocolor::scale_color_carto_c(palette = "ag_Sunset") +
# scale_size_area(max_size = 20) +
# theme_minimal()
#
# if(save == T){
# ggsave(here::here("plots", "4_cloud_sequ.pdf"),
# width = 25, height = 20, device = cairo_pdf)
# }
## bug fix: the `df_size_edu <-` assignment was fused onto the commented-out
## line above, which left the whole assignment commented out — restored here.
## Flags per job offer whether a bachelor's/master's/doctorate degree is
## mentioned, then pivots to one row per offer × degree level.
df_size_edu <-
  df_size %>%
  mutate(
    ## NOTE(review): the dots in these patterns are unescaped regex wildcards
    ## (e.g. "B.Sc." also matches "BxScy") — presumably intended as literal
    ## dots; confirm whether the extra fuzziness is wanted
    bachelors = if_else(str_detect(description, "B.Ba.|B.Sc.|BBa|BSc|BBA|BSC|Bachelors"), 1, 0),
    masters = if_else(str_detect(description, "M.Ba.|M.Sc.|MBa|MSc|MBA|MSC|Masters"), 1, 0),
    doctorate = if_else(str_detect(description, "Ph.D.|PhD|Doctorate"), 1, 0)
  ) %>%
  dplyr::select(size, bachelors, masters, doctorate) %>%
  pivot_longer(
    cols = c(bachelors, masters, doctorate),
    names_to = "education",
    values_to = "yes_no"
  ) %>%
  group_by(size) %>%
  mutate(
    ## `total` counts rows AFTER pivoting, i.e. 3 rows per job offer
    total = n(),
    education = str_to_title(education),
    education = factor(education, levels = c("Doctorate", "Masters", "Bachelors"))
  )
df_size_edu %>%
group_by(education) %>%
summarize(n = sum(yes_no)) %>%
ggplot(aes(education, n)) +
geom_col(
fill = bl_col,
color = after_scale(darken(bl_col, .4, space = "HLS")),
width = .8,
size = 1.2
) +
scale_y_continuous(
expand = c(0, 0),
limits = c(0, 42)
) +
labs(
x = NULL,
y = "Number of mentions"
) +
theme(
panel.grid.major.x = element_blank(),
axis.text.x = element_text(size = 18)
)if(save == T){
ggsave(here::here("plots", "5_1_require_edu_histo.pdf"),
width = 12, height = 7.5, device = cairo_pdf)
}Bachelors: 38 Masters: 10 Doctorate: 1
## stacked bars: share of degree mentions per company-size class.
## NOTE(review): `total` counts 3 pivoted rows per job offer, so `rel` is a
## third of the per-offer share — hence the tiny 3.1% y limit; confirm intended
df_size_edu %>%
group_by(size, education) %>%
summarize(rel = sum(yes_no) / unique(total)) %>%
ungroup() %>%
ggplot(
aes(
size, rel,
fill = education
)
) +
geom_col(width = .8) +
scale_y_continuous(
expand = c(0, 0),
limits = c(0, .031),
labels = scales::percent_format()
) +
scale_fill_manual(
values = cols_degree,
name = "Required degree:"
) +
guides(fill = guide_legend(reverse = T)) +
theme(
legend.text = element_text(size = 14),
panel.grid.major.x = element_blank()
) +
labs(
x = "Number of employees",
y = "Percentage"
)if(save == T){
ggsave(here::here("plots", "5_1_require_edu_size.pdf"),
width = 12, height = 7.5, device = cairo_pdf)
}df_rev_edu <-
## degree mentions per revenue class (same approach as df_size_edu), then
## three variants of the stacked bar chart that only differ in how the long
## x axis labels are handled (dodged axis / smaller text / angled text)
df_revenue %>%
mutate(
## NOTE(review): unescaped dots act as regex wildcards here — see df_size_edu
bachelors = if_else(str_detect(description, "B.Ba.|B.Sc.|BBa|BSc|BBA|BSC|Bachelors"), 1, 0),
masters = if_else(str_detect(description, "M.Ba.|M.Sc.|MBa|MSc|MBA|MSC|Masters"), 1, 0),
doctorate = if_else(str_detect(description, "Ph.D.|PhD|Doctorate"), 1, 0)
) %>%
dplyr::select(revenue, bachelors, masters, doctorate) %>%
pivot_longer(
cols = c(bachelors, masters, doctorate),
names_to = "education",
values_to = "yes_no"
) %>%
group_by(revenue) %>%
mutate(
## `total` counts rows after pivoting, i.e. 3 rows per job offer
total = n(),
education = str_to_title(education),
education = factor(education, levels = c("Doctorate", "Masters", "Bachelors"))
) %>%
group_by(revenue, education) %>%
summarize(rel = sum(yes_no) / unique(total)) %>%
ungroup()
## variant 1: dodged axis labels
ggplot(df_rev_edu,
aes(
revenue, rel,
fill = education)
) +
geom_col(width = .8) +
scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
scale_y_continuous(
expand = c(.0005, .0005),
limits = c(0, .031),
labels = scales::percent_format()
) +
scale_fill_manual(
values = cols_degree,
name = "Required degree:"
) +
guides(fill = guide_legend(reverse = T)) +
theme(
legend.text = element_text(size = 14),
panel.grid.major.x = element_blank()
) +
labs(
x = "Estimated revenue",
y = "Percentage"
)if(save == T){
ggsave(here::here("plots", "5_1_require_edu_revenue_dodge.pdf"),
width = 12, height = 6.5, device = cairo_pdf)
}
## variant 2: smaller axis text
ggplot(df_rev_edu,
aes(
revenue, rel,
fill = education
)) +
geom_col(width = .8) +
scale_y_continuous(
expand = c(.0005, .0005),
limits = c(0, .031),
labels = scales::percent_format()
) +
scale_fill_manual(
values = cols_degree,
name = "Required degree:"
) +
guides(fill = guide_legend(reverse = T)) +
theme(
legend.text = element_text(size = 14),
panel.grid.major.x = element_blank(),
axis.text.x = element_text(size = 9)
) +
labs(
x = "Estimated revenue",
y = "Percentage"
)if(save == T){
ggsave(here::here("plots", "5_1_require_edu_revenue_small.pdf"),
width = 12, height = 6.5, device = cairo_pdf)
}
## variant 3: angled axis text
ggplot(df_rev_edu, aes(revenue, rel,
fill = education)) +
geom_col(width = .8) +
scale_y_continuous(
expand = c(.0005, .0005),
limits = c(0, .031),
labels = scales::percent_format()
) +
scale_fill_manual(
values = cols_degree,
name = "Required degree:"
) +
guides(fill = guide_legend(reverse = T)) +
theme(
legend.text = element_text(size = 14),
panel.grid.major.x = element_blank(),
axis.text.x = element_text(angle = 22, hjust = 1, vjust = 1)
) +
labs(
x = "Estimated revenue",
y = "Percentage"
)I for now use the programming languages listed by the SO yearly survey: JavaScript, HTML/CSS, SQL, Python, Java, Bash/Shell/PowerShell, C#, PHP, C++, TypeScript, C, Ruby, Go, Assembly, Swift, Kotlin, R, VBA, Objective-C, Scala, Rust, Dart, Elixir, Clojure, WebAssembly + Julia
## count mentions of each SO-survey programming language in the (lower-cased)
## job descriptions, one column per language.
## NOTE(review): "\\bc\\b" also matches the "c" inside "c++" and "c#", so the
## C column is inflated by those mentions — confirm whether that is acceptable
df_prog <-
  df_gd %>%
  mutate(
    description = str_to_lower(description),
    JavaScript = str_count(description, "\\bjavascript\\b"),
    HTML = str_count(description, " html\\b"),
    CSS = str_count(description, "\\bcss\\b"),
    SQL = str_count(description, "\\bsql\\b"),
    Python = str_count(description, "\\bpython\\b"),
    Java = str_count(description, "\\bjava\\b"),
    Bash = str_count(description, "\\bbash\\b"),
    Shell = str_count(description, "\\bshell\\b"),
    Powershell = str_count(description, "\\bpowershell\\b"),
    `C#` = str_count(description, "\\bc#\\b"),
    PHP = str_count(description, "\\bphp\\b"),
    `C++` = str_count(description, "\\bc\\b\\+\\+"),
    TypeScript = str_count(description, "\\btypescript\\b"),
    C = str_count(description, "\\bc\\b"),
    Ruby = str_count(description, "\\bruby\\b"),
    Go = str_count(description, "\\bgo\\b"),
    Assembly = str_count(description, "\\bassembly\\b"),
    Swift = str_count(description, "\\bswift\\b"),
    Kotlin = str_count(description, "\\bkotlin\\b"),
    R = str_count(description, "\\br\\b"),
    VBA = str_count(description, "\\bvba\\b"),
    ObjectiveC = str_count(description, "\\bobjective-c\\b|\\bobjective\\sc\\b"),
    Scala = str_count(description, "\\bscala\\b"),
    Rust = str_count(description, "\\brust\\b"),
    Dart = str_count(description, "\\bdart\\b"),
    Elixir = str_count(description, "\\belixir\\b"),
    ## bug fix: was "\\bCcojure\\b" — misspelled AND upper-case, so it could
    ## never match the lower-cased description
    Clojure = str_count(description, "\\bclojure\\b"),
    WebAssembly = str_count(description, "\\bwebassembly\\b"),
    Julia = str_count(description, "\\bjulia\\b")
  )
df_prog %>%
## lollipop chart: number of job descriptions mentioning each language at
## least once (count > 0 per description, then one row per language)
dplyr::select(JavaScript:Julia) %>%
pivot_longer(
cols = JavaScript:Julia,
names_to = "language",
values_to = "count"
) %>%
filter(count > 0) %>%
group_by(language) %>%
summarize(count = n()) %>%
ungroup() %>%
mutate(language = fct_reorder(language, count)) %>%
ggplot(aes(language, count)) +
geom_segment(
aes(
xend = language,
yend = 0
),
color = bl_col,
size = 3
) +
geom_point(
shape = 21,
color = bl_col,
fill = "white",
size = 10,
stroke = 1
) +
geom_text(
aes(label = count),
family = "Overpass Mono",
color = bl_col,
fontface = "bold",
size = 3.3,
hjust = .5,
nudge_y = 0
) +
coord_flip() +
scale_y_continuous(
expand = c(.02, .02),
limits = c(0, 340),
breaks = seq(0, 300, by = 50)
) +
theme_flip +
theme(axis.text.y = element_text(size = 14)) +
labs(
x = NULL,
y = "Number of job descriptions mentioning each programming language"
)if(save == T){
ggsave(here::here("plots", "5_2_require_prog.pdf"),
width = 12, height = 7, device = cairo_pdf)
}df_prog %>%
dplyr::select(JavaScript:Julia) %>%
mutate(id = row_number()) %>%
pivot_longer(
cols = JavaScript:Julia,
names_to = "language",
values_to = "count"
) %>%
group_by(id) %>%
mutate(sum = sum(count)) %>%
filter(
sum > 1,
count > 0
) %>%
arrange(id) %>%
pivot_wider(
id_cols = id,
names_from = language,
values_from = count
) %>%
ungroup() %>%
dplyr::select(-id) %>%
mutate_all(funs(ifelse(. == 1, deparse(substitute(.)), NA))) %>%
unite("combination", sep = " + ", remove = T, na.rm = T) %>%
group_by(combination) %>%
count() %>%
ungroup() %>%
filter(
combination != "",
str_detect(combination, "\\+"),
n > 2
) %>%
mutate(combination = fct_reorder(combination, n)) %>%
ggplot(aes(combination, n)) +
geom_segment(
aes(
xend = combination,
yend = 0
),
color = bl_col,
size = 2
) +
geom_point(
shape = 21,
color = bl_col,
fill = "white",
size = 8,
stroke = 1
) +
geom_text(
aes(label = n),
family = "Overpass Mono",
color = bl_col,
fontface = "bold",
size = 3.5,
hjust = .5,
nudge_y = 0
) +
coord_flip() +
scale_y_continuous(
expand = c(.02, .02),
breaks = seq(0, 70, by = 10),
limits = c(0, 72)
) +
theme_flip +
labs(
x = NULL,
y = "Mentioned combinations of programming\nlanguages in job descriptions",
caption = 'Note: Only combinations with a frequency of 3 or more shown.'
)tools <- str_to_lower(c(
## SEO tool names, lower-cased, used as str_detect() patterns below.
## NOTE(review): some entries keep a trailing space ("Majestic ", "HARO ",
## "SEOGadget ", "Google Location ") — as patterns these only match when a
## space follows; and generic names like "Ninja" or "Moz" may over-match —
## confirm whether that is intended
"Bing Webmaster Tools",
"Botify",
"Bright Local",
"Browseo",
"Clusteric",
"ContentKing App",
"DareBoost",
"DeepCrawl",
"EasyRedir",
"Forecheck",
"Google Analytics",
"Google Mobile-Friendly Test",
"Google PageSpeed Insights",
"Google Search Console",
"Google XML Sitemaps",
"GTmetrix",
"HeadMasterSEO",
"LinkPatrol",
"Lipperhey",
"OnCrawl",
"Panguin Tool",
"Raven Tools",
"Screaming Frog",
"Seobility",
"Seomator",
"SERPmetrics",
"Siteliner",
"Topvisor",
"Varvy SEO Tool",
"Whitespark",
"Woorank",
"Yoast",
"Zadroweb",
"Answer The Public",
"ClearScope",
"Exploding Topics",
"FAQfox",
"Google Keyword Planner",
"Google Location ",
"Google Trends",
"Gookey",
"GrepWords",
"HitTail",
"Imforsmb",
"iSpionage",
"Jaaxy",
"Keyword Eye",
"Keyword Revealer",
"Keyword Snatcher",
"Keyworddit",
"KeywordIn",
"Keywords Everywhere",
"KeywordSpy",
"KeywordTool.io",
"Kombinator",
"kwfinder",
"Long Tail Pro",
"Power Suggest Pro",
"QuestionDB",
"SanityCheck",
"SECockpit",
"Seed Keywords",
"SEMrush",
"SERPStat",
"SimilarWeb",
"Soovle",
"SpyFu",
"StoryBase",
"TermExplorer",
"TwinWord",
"UberSuggest",
"Webtexttool",
"Wondersearch",
"Wordstream's Free Keyword Tools",
"WordTracker",
"Wordtracker Scout",
"Advanced Web Ranking",
"Agency Analytics",
"AMZ Tracker",
"Authority Labs",
"GeoRanker",
"Microsite Masters",
"NightWatch",
"Pro Rank Tracker",
"Rank Ranger",
"Rival IQ",
"SE Ranking",
"Search Latte",
"Serpfox",
"SERPs.com",
"SERPWoo",
"Sistrix",
"WebCEO",
"WordTail",
"Animalz Revive",
"BuzzSumo",
"Can I Rank",
"ClickFlow",
"Google SERP Preview Tool",
"Keys4Up",
"LSIGraph",
"MarketMuse",
"MetaTags.io",
"nTopic",
"Positionly",
"Ryte",
"SEOptimer",
"TrendSpottr",
"Upcity",
"WordLift",
"Ahrefs",
"cognitiveSEO",
"Kerboo",
"Majestic ",
"Moz",
"MozBar",
"SEO PowerSuite",
"SEOGadget ",
"ShareMetric",
"URL Profiler",
"WebMeUp Backlink Tool",
"Morningfame",
"Social Blade",
"TubeBuddy",
"VidIQ",
"YTCockpit",
"AuthoritySpy",
"Buzzstream",
"DIBZ",
"disavow.it",
"Domain Hunter Plus",
"GroupHigh",
"HARO ",
"JustReachOut",
"Linkbird",
"Linkody",
"Linkstant",
"MailShake",
"Muck Rack",
"Ninja",
"Ontolo",
"Pitchbox",
"Remove'em",
"Rmoov",
"ScrapeBox",
"tableau",
"qlik",
"power bi"
))df_en %>%
## bar chart: how many of the listed tools each offer mentions.
## rowwise() makes `description` a scalar per row, so str_detect() tests the
## whole pattern vector against that one description and sum() counts the hits
mutate(description = str_to_lower(description)) %>%
rowwise() %>%
mutate(n_tools = sum(str_detect(description, tools))) %>%
count(n_tools) %>%
ggplot(aes(n_tools, n)) +
geom_col(fill = bl_col, width = .8) +
geom_text(
aes(label = n),
family = "Montserrat",
color = bl_col,
fontface = "bold",
size = 4,
nudge_y = 10
) +
## NOTE(review): labels assume n_tools never exceeds 10 — verify against data
scale_x_continuous(
breaks = 0:10,
labels = c("None", "1 tool", glue::glue("{2:10} tools")),
expand = c(.001, .001)
) +
scale_y_continuous(
breaks = NULL,
expand = c(.01, .01)
) +
theme(
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank(),
axis.line.y = element_blank()
) +
labs(
x = "Number of tools mentioned in job descriptions",
y = NULL
)df_en %>%
mutate(
description = str_to_lower(description),
exp = str_extract(description, "[0-9]+ years experience|1 year experience|experience: [0-9]+ year|experience [0-9]+ year|experience of [0-9]+ year"),
exp = as.numeric(str_extract(exp, "[0-9]+")),
) %>%
filter(!is.na(exp)) %>%
count(exp) %>%
ggplot(aes(exp, n)) +
geom_col(
fill = bl_col,
width = .9
) +
scale_x_continuous(
breaks = c(1:6, 10, 30),
expand = c(.01, .01)
) +
scale_y_continuous(
expand = c(.01, .01)
) +
labs(
x = "Years of experience needed",
y = "Number of job offers"
) +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)if(save == T){
ggsave(here::here("plots", "5_4_require_experience.pdf"),
width = 12, height = 8, device = cairo_pdf)
}df_salary <-
## parse the Glassdoor salary range ("$45K-$78K") into numeric bounds and
## an average, then plot the distribution of average salaries
df_gd %>%
  dplyr::select(salary, employer, ratings, description, location) %>%
  filter(!is.na(salary)) %>%
  mutate(
    ## lower bound keeps "$…-", upper bound keeps "-$…"; the first number is
    ## extracted from each below
    salary_low = str_extract(salary, "\\$.*\\-"),
    salary_high = str_extract(salary, "\\-\\$.*"),
  ) %>%
  mutate_at(vars(matches("salary_")), ~str_extract(., "[0-9]+")) %>%
  mutate_at(vars(matches("salary_")), as.numeric) %>%
  mutate(
    ## bug fix: midpoint of the range — the original `salary_low +
    ## salary_high / 2` only halved the upper bound due to operator precedence
    salary_avg = (salary_low + salary_high) / 2,
    salary_class = salary_avg %/% 10 * 10
  )
df_salary %>%
  ggplot(aes(x = salary_avg)) +
  geom_histogram(
    fill = bl_col,
    color = darken(bl_col, .4),
    bins = 50
  ) +
  geom_vline(
    aes(xintercept = mean(salary_avg)),
    linetype = "dashed",
    color = "grey35",
    size = 1
  ) +
  annotate(
    "text",
    x = 100,
    y = 47,
    label = glue::glue("Mean: ${round(mean(df_salary$salary_avg), 0)}K"),
    family = "Montserrat",
    color = "grey35",
    fontface = "bold",
    size = 4.5
  ) +
  scale_x_continuous(
    limits = c(1, 300),
    breaks = c(1, seq(50, 300, by = 50)),
    labels = glue::glue("${c(1, seq(50, 300, by = 50))}K")
  ) +
  scale_y_continuous(
    expand = c(0, 0)
  ) +
  scale_fill_manual(
    values = c("grey60", bl_col),
    guide = F
  ) +
  labs(
    x = "Average salary",
    y = "Number of job offers"
  ) +
  theme(panel.grid.major.x = element_blank())
df_salary_cities <-
## average salary per city (5+ offers only), CA/TX highlighted via a lumped
## state factor; bar labels rendered with ggtext for the bold salary part
df_salary %>%
group_by(location) %>%
summarize(
avg = mean(salary_avg),
n = n()
) %>%
filter(n >= 5) %>%
mutate(
## state abbreviation = last comma-free token of "City, ST"
state = str_extract(location, "[^, ]*$"),
state_lump = case_when(
state == "CA" ~ "California",
state == "TX" ~ "Texas",
TRUE ~ "other"
)
) %>%
arrange(-avg)
df_salary_cities %>%
ggplot(aes(avg, fct_reorder(location, avg))) +
geom_col(
aes(fill = state_lump),
orientation = "y",
width = .85
) +
## label only the first occurrence of each highlighted state (rows 1 and 8)
geom_text(
data = df_salary_cities %>% slice(c(1, 8)),
aes(label = state_lump),
family = "Montserrat",
fontface = "bold",
color = "white",
hjust = 1,
nudge_x = -1.5
) +
ggtext::geom_richtext(
aes(label = glue::glue("**${round(avg, 1)}K** (n = {n})")),
family = "Montserrat",
color = "grey40",
size = 3,
fill = NA,
label.color = NA,
hjust = 0,
nudge_x = .5
) +
scale_x_continuous(
breaks = seq(0, 100, by = 25),
labels = glue::glue("${seq(0, 100, by = 25)}K"),
expand = c(.001, .001),
limits = c(0, 130)
) +
scale_fill_manual(
values = c(darken(bl_col, .25), "grey60", bl_col),
guide = F
) +
labs(
x = "Average salary",
y = NULL,
caption = 'Note: Only cities with 5 or more offers including salary estimations shown.'
) +
theme(panel.grid.major.y = element_blank())if(save == T){
ggsave(here::here("plots", "6_2_salary_cities.pdf"),
width = 12, height = 8, device = cairo_pdf)
}df_salary_states <-
## salary summary per state (10+ offers): dot = mean, line = min–max range
df_salary %>%
mutate(
state = str_extract(location, "[^, ]*$"),
state_lump = case_when(
state == "CA" ~ "California",
state == "TX" ~ "Texas",
TRUE ~ "other"
)
) %>%
group_by(state) %>%
summarize(
avg = mean(salary_avg),
min = min(salary_avg),
max = max(salary_avg),
sd = sd(salary_avg),
n = n()
) %>%
filter(n >= 10) %>%
arrange(-avg)
df_salary_states %>%
ggplot(aes(avg, fct_reorder(state, avg))) +
geom_linerange(
aes(
xmin = min,#avg - sd,
xmax = max,#avg + sd
),
color = bl_col,
size = 1.2
) +
geom_point(
color = darken(bl_col, .2, space = "HLS"),
size = 7
) +
geom_text(
aes(label = glue::glue("${round(avg, 1)}K")),
family = "Montserrat",
color = darken(bl_col, .2, space = "HLS"),
fontface = "bold",
size = 3,
nudge_x = -9,
nudge_y = .3
) +
## NOTE(review): `fill` and `label.color` are geom_richtext arguments —
## geom_text ignores them; presumably left over from a geom swap
geom_text(
aes(label = glue::glue("(n = {n})")),
family = "Montserrat",
color = darken(bl_col, .2, space = "HLS"),
size = 3,
fill = NA,
label.color = NA,
hjust = 0,
nudge_x = 4,
nudge_y = .3
) +
scale_x_continuous(
breaks = seq(0, 300, by = 50),
labels = glue::glue("${seq(0, 300, by = 50)}K")
) +
labs(
x = "Average salary",
y = NULL,
caption = 'Note: Only states with 10 or more offers including salary estimations shown.\nPoints show the average estimated salary, lines indicate the range of salaries per state.'
)df_salary %>%
mutate(
description = str_to_lower(description),
prog = str_detect(description, c("\\bjavascript\\b", "\\bhtml\\b", "\\bcss\\b", "\\bsql\\b",
"\\bpython\\b", "\\bjava\\b", "\\bbash\\b", "\\bshell\\b",
"\\bpowershell\\b", "\\bc#\\b", "\\bphp\\b", "\\bc\\b\\+\\+",
"\\btypescript\\b", "\\bc\\b", "\\bruby\\b", "\\bgo\\b",
"\\bassembly\\b", "\\bswift\\b", "\\bkotlin\\b", "\\br\\b",
"\\bvba\\b", "\\bobjective-c\\b", "\\bobjective\\sc\\b",
"\\bscala\\b", "\\brust\\b", "\\bdart\\b", "\\belixir\\b",
"\\bCcojure\\b", "\\bwebassembly\\b", "\\bjulia\\b"))
) %>%
dplyr::select(salary_avg, prog) %>%
group_by(prog) %>%
summarize(
avg = mean(salary_avg),
med = median(salary_avg),
sd = sd(salary_avg)
) %>%
mutate(hjust = if_else(prog == F, 1.3, -.3)) %>%
ggplot(aes(prog, avg)) +
geom_linerange(
aes(
ymin = avg - sd,
ymax = avg + sd
),
color = "grey40",
size = 1.2
) +
geom_line(
aes(group = 1),
color = "grey40",
size = .4,
linetype = "dashed"
) +
geom_point(
color = "grey40",
size = 7
) +
geom_line(
aes(
y = med,
group = 1
),
color = bl_col,
size = .4,
linetype = "dashed"
) +
geom_point(
aes(y = med),
size = 3,
stroke = 2,
shape = 4,
color = bl_col
) +
geom_text(
aes(
label = glue::glue("${round(avg, 1)}K"),
hjust = hjust
),
family = "Montserrat",
color = "grey40",
fontface = "bold",
size = 4.5,
nudge_y = 3
) +
geom_text(
aes(
y = med,
label = glue::glue("${round(med, 1)}K"),
hjust = hjust
),
family = "Montserrat",
color = bl_col,
fontface = "bold",
size = 4.5,
nudge_y = -2
) +
annotate(
"text",
x = 1.5,
y = 78,
label = "Average",
family = "Montserrat",
color = "grey40",
fontface = "bold",
size = 4.5
) +
annotate(
"text",
x = 1.5,
y = 69,
label = "Median",
family = "Montserrat",
color = bl_col,
fontface = "bold",
size = 4.5
) +
scale_x_discrete(
labels = c("No", "Yes"),
expand = c(.2, .2)
) +
scale_y_continuous(
breaks = seq(40, 120, by = 20),
labels = glue::glue("${seq(40, 120, by = 20)}K"),
limits = c(40, NA)
) +
labs(
x = "Programming language\nmentioned as requirement?",
y = "Estimated salary",
caption = "The line shows the standard deviation from the estimated average salaries."
) +
theme(panel.grid.major.x = element_blank())df_salary %>%
## scatter of company rating vs. average salary with a loess smoother;
## hollow + translucent points fake a density-aware marker
filter(!is.na(ratings)) %>%
ggplot(aes(ratings, salary_avg)) +
geom_smooth(
#method = "lm",
color = darken(bl_col, .2),
se = F
) +
geom_point(
shape = 1,
size = 4,
color = bl_col,
stroke = .2
) +
geom_point(
size = 4,
alpha = .1,
color = bl_col
) +
scale_y_continuous(
breaks = seq(50, 300, by = 50),
labels = glue::glue("${seq(50, 300, by = 50)}K"),
limits = c(50, 300)
) +
labs(
x = "Company rating",
y = "Average salary (calculated from range)"
)if(save == T){
ggsave(here::here("plots", "6_5_rating_salary.pdf"),
width = 12, height = 7.5, device = cairo_pdf)
}df_dates <-
df_li_en %>%
mutate(
year = lubridate::year(posted_on),
week = lubridate::week(posted_on),
wday = lubridate::wday(posted_on),
wday_lab = lubridate::wday(posted_on, label = T, abbr = F),
week_cum = week + (year - 2018) * 52
) %>%
filter(year != 2018)Old job offers may not be scraped since they were filled already and removed from/by LinkedIn?! I would not investigate anything with this data at all…
## Weekly posting volume over the whole scrape, one bar per cumulative week,
## filled by year.
df_dates %>%
  count(week_cum, year) %>%
  ggplot(aes(week_cum, n)) +
  ## width slightly > 1 so adjacent bars touch with no visible gap
  geom_col(aes(fill = as.factor(year)), width = 1.05) +
  scale_x_continuous(expand = c(.005, .005)) +
  scale_y_continuous(expand = c(.01, .01)) +
  scale_fill_manual(
    values = c(bl_col, darken(bl_col, .2, space = "HLS")),
    name = NULL
  ) +
  labs(
    x = "Date posted in weeks",
    y = "Number of job offers on LinkedIn.com"
  ) +
  theme(
    axis.text.x = element_blank(),
    ## legend placed inside the panel
    legend.position = c(.3, .7),
    legend.direction = "horizontal"
  )

## was `if(save == T)` — compare via isTRUE() instead of `== T`
if (isTRUE(save)) {
  ggsave(here::here("plots", "7_1_dates_weeks_all.pdf"),
         width = 12, height = 6.5, device = cairo_pdf)
}

## 2020, US-only postings per week (pipeline continues on the next lines)
df_dates %>%
## Restrict to 2020 LinkedIn postings and drop UK/Ireland locations so only
## US job offers remain, then plot postings per calendar week. Weeks before 9
## (pre-COVID threshold) are highlighted by the fill mapping.
filter(
  year == 2020,
  str_detect(url, "https://www.linkedin.com"),
  !str_detect(location, "United Kingdom|UK|England|Wales|Ireland")
) %>%
  count(week) %>%
  ggplot(aes(week, n)) +
  geom_col(
    aes(fill = week < 9),
    width = .85
  ) +
  scale_x_continuous(
    breaks = 1:52,
    expand = c(.01, .01)
  ) +
  scale_y_continuous(
    expand = c(.01, .01),
    breaks = seq(0, 200, by = 20)
  ) +
  scale_fill_manual(
    values = c("#b30086", bl_col),
    ## `guide = F` is deprecated in ggplot2 (>= 3.3.4); use "none"
    guide = "none"
  ) +
  labs(
    x = "Week posted",
    y = "Number of job offers on LinkedIn.com",
    ## fixed typos in the original caption: "COID-19" -> "COVID-19",
    ## "job posting" -> "job postings"
    caption = "Note: In week 9 the number of confirmed COVID-19 infections reached 100 in the US and the bars are thus colored pink afterwards. Each bar shows only US job postings on LinkedIn."
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

## postings per weekday (pipeline continues on the next lines)
df_dates %>%
## Distribution of postings across weekdays; bars are ordered by the
## numeric wday value so the labeled factor follows calendar order.
count(wday, wday_lab) %>%
  ggplot(aes(fct_reorder(wday_lab, wday), n)) +
  geom_col(fill = bl_col, width = .85) +
  labs(
    x = "Weekday posted",
    y = "Number of job offers on LinkedIn.com"
  ) +
  scale_y_continuous(expand = c(.01, .01))

## Converting page 1 to 1_1_jobs_cat_1.png... done!
## Converting page 1 to 1_1_jobs_tech_adj_1.png... done!
## Converting page 1 to 1_1_jobs_word_1.png... done!
## Converting page 1 to 2_1_bars_cities_1.png... done!
## Converting page 1 to 2_1_bars_cities_10+_1.png... done!
## Converting page 1 to 2_1_map_northamerica_cities_1.png... done!
## Converting page 1 to 2_1_map_states_cities_1.png... done!
## Converting page 1 to 2_2_map_states_chloro_1.png... done!
## Converting page 1 to 2_2_map_states_chloro2_1.png... done!
## Converting page 1 to 2_2_map_states_hex_1.png... done!
## Converting page 1 to 2_3_map_counties_chloro_1.png... done!
## Converting page 1 to 3_1_size_histo_1.png... done!
## Converting page 1 to 3_2_revenue_histo_1.png... done!
## Converting page 1 to 3_2_revenue_words_horizontal_1.png... done!
## Converting page 1 to 3_2_revenue_words_vertical_1.png... done!
## Converting page 1 to 3_3_industry_counts_1.png... done!
## Converting page 1 to 3_3_sector_counts_1.png... done!
## Converting page 1 to 3_4_rating_histo_1.png... done!
## Converting page 1 to 3_4_rating_lolli_1.png... done!
## Converting page 1 to 4_cloud_sequ_1.png... done!
## Converting page 1 to 4_cloud_word_1.png... done!
## Converting page 1 to 5_1_require_edu_histo_1.png... done!
## Converting page 1 to 5_1_require_edu_revenue_angle_1.png... done!
## Converting page 1 to 5_1_require_edu_revenue_dodge_1.png... done!
## Converting page 1 to 5_1_require_edu_revenue_small_1.png... done!
## Converting page 1 to 5_1_require_edu_size_1.png... done!
## Converting page 1 to 5_2_require_prog_1.png... done!
## Converting page 1 to 5_2_require_prog_comb_1.png... done!
## Converting page 1 to 5_3_require_tools_1.png... done!
## Converting page 1 to 5_4_require_experience_1.png... done!
## Converting page 1 to 6_1_salary_histo_1.png... done!
## Converting page 1 to 6_2_salary_cities_1.png... done!
## Converting page 1 to 6_2_salary_states_1.png... done!
## Converting page 1 to 6_4_salary_req_prog_1.png... done!
## Converting page 1 to 6_5_rating_salary_1.png... done!
## Converting page 1 to 7_1_dates_weeks_2020_1.png... done!
## Converting page 1 to 7_1_dates_weeks_all_1.png... done!
## Converting page 1 to 7_2_dates_weekday_1.png... done!